all_times <- list() # store the time for each chunk
knitr::knit_hooks$set(time_it = local({
now <- NULL
function(before, options) {
if (before) {
now <<- Sys.time()
} else {
res <- difftime(Sys.time(), now, units = "secs")
all_times[[options$label]] <<- res
}
}
}))
knitr::opts_chunk$set(
tidy = TRUE,
tidy.opts = list(width.cutoff = 95),
fig.width = 10,
message = FALSE,
warning = FALSE,
time_it = TRUE,
error = TRUE
)
The joint analysis of two or more single-cell datasets poses unique challenges. In particular, identifying cell populations that are present across multiple datasets can be problematic under standard workflows. Seurat v4 includes a set of methods to match (or ‘align’) shared cell populations across datasets. These methods first identify cross-dataset pairs of cells that are in a matched biological state (‘anchors’), can be used both to correct for technical differences between datasets (i.e. batch effect correction), and to perform comparative scRNA-seq analysis of across experimental conditions.
Below, we demonstrate methods for scRNA-seq integration as described in Stuart*, Butler* et al, 2019 to perform a comparative analysis of human immune cells (PBMC) in either a resting or interferon-stimulated state.
The following tutorial is designed to give you an overview of the kinds of comparative analyses on complex cell types that are possible using the Seurat integration procedure. Here, we address a few key goals:
For convenience, we distribute this dataset through our SeuratData package.
options(SeuratData.repo.use = "http://satijalab04.nygenome.org")
library(Seurat)
options(Seurat.object.assay.version = "v5")
library(SeuratData)
library(patchwork)
# install dataset
InstallData("ifnb")
# load dataset
LoadData("ifnb")
## Error in slot(object = object, name = s): no slot of name "images" for this object of class "Seurat"
ifnb <- UpdateSeuratObject(ifnb)
ifnb[["RNA"]] <- CreateAssay5Object(ifnb[["RNA"]]@counts)
# split the dataset into layers (stim and CTRL)
ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim)
# normalize and identify variable features for each dataset independently
ifnb <- NormalizeData(ifnb)
ifnb <- FindVariableFeatures(ifnb, selection.method = "vst", nfeatures = 2000)
features <- VariableFeatures(ifnb)
# these two now are run before
ifnb <- ScaleData(ifnb)
ifnb <- RunPCA(ifnb)
# # select features that are repeatedly variable across datasets for integration features <-
# SelectIntegrationFeatures(object.list = ifnb.list)
ifnb
We then identify anchors using the
FindIntegrationAnchors() function (not any more), which
takes a list of Seurat objects as input, and use these anchors to
integrate the two layers together with
IntegrateLayers().
ifnb <- IntegrateLayers(object = ifnb, method = CCAIntegration, features = features, verbose = F)
Now we can run a single integrated analysis on all cells!
# specify that we will perform downstream analysis on the corrected data note that the
# original unmodified data still resides in the 'RNA' assay
# Run the standard workflow for visualization and clustering use integrated.dr here instead of
# pca
ifnb <- RunUMAP(ifnb, reduction = "integrated.dr", dims = 1:30)
ifnb <- FindNeighbors(ifnb, reduction = "integrated.dr", dims = 1:30)
ifnb <- FindClusters(ifnb, resolution = 0.5)
# Visualization
p1 <- DimPlot(ifnb, reduction = "umap", group.by = "stim")
p2 <- DimPlot(ifnb, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2
To visualize the two conditions side-by-side, we can use the
split.by argument to show each condition colored by
cluster.
DimPlot(ifnb, reduction = "umap", split.by = "stim")
To identify canonical cell type marker genes that are conserved
across conditions, we provide the FindConservedMarkers()
function. This function performs differential gene expression testing
for each dataset/group and combines the p-values using meta-analysis
methods from the MetaDE R package. For example, we can calculated the
genes that are conserved markers irrespective of stimulation condition
in cluster 6 (NK cells).
# For performing differential expression after integration, we switch back to the original
# data
DefaultAssay(ifnb) <- "RNA"
# Join Data Layers across stimualtions
ifnb[["RNA"]] <- JoinLayers(ifnb[["RNA"]], search = "data", new = "data")
nk.markers <- FindConservedMarkers(ifnb, ident.1 = 6, grouping.var = "stim", verbose = FALSE)
head(nk.markers)
## CTRL_p_val CTRL_avg_log2FC CTRL_pct.1 CTRL_pct.2 CTRL_p_val_adj
## GNLY 0 6.012755 0.946 0.046 0
## FGFBP2 0 3.253231 0.503 0.021 0
## CLIC3 0 3.480735 0.605 0.024 0
## CTSW 0 3.025603 0.541 0.030 0
## KLRD1 0 2.803233 0.510 0.019 0
## KLRC1 0 2.615312 0.391 0.003 0
## STIM_p_val STIM_avg_log2FC STIM_pct.1 STIM_pct.2 STIM_p_val_adj
## GNLY 0.000000e+00 5.792059 0.949 0.061 0.000000e+00
## FGFBP2 4.908844e-168 2.190945 0.268 0.015 6.898399e-164
## CLIC3 0.000000e+00 3.551895 0.627 0.031 0.000000e+00
## CTSW 0.000000e+00 3.162748 0.602 0.035 0.000000e+00
## KLRD1 0.000000e+00 2.868744 0.554 0.027 0.000000e+00
## KLRC1 0.000000e+00 2.539733 0.379 0.006 0.000000e+00
## max_pval minimump_p_val
## GNLY 0.000000e+00 0
## FGFBP2 4.908844e-168 0
## CLIC3 0.000000e+00 0
## CTSW 0.000000e+00 0
## KLRD1 0.000000e+00 0
## KLRC1 0.000000e+00 0
We can explore these marker genes for each cluster and use them to annotate our clusters as specific cell types.
FeaturePlot(ifnb, features = c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2",
"PPBP"), min.cutoff = "q9")
ifnb <- RenameIdents(ifnb, `0` = "CD14 Mono", `1` = "CD4 Naive T", `2` = "CD4 Memory T", `3` = "CD16 Mono",
`4` = "B", `5` = "CD8 T", `6` = "NK", `7` = "T activated", `8` = "DC", `9` = "B Activated",
`10` = "Mk", `11` = "pDC", `12` = "Eryth", `13` = "Mono/Mk Doublets", `14` = "HSPC")
DimPlot(ifnb, label = TRUE)
The DotPlot() function with the split.by
parameter can be useful for viewing conserved cell type markers across
conditions, showing both the expression level and the percentage of
cells in a cluster expressing any given gene. Here we plot 2-3 strong
marker genes for each of our 14 clusters.
Idents(ifnb) <- factor(Idents(ifnb), levels = c("HSPC", "Mono/Mk Doublets", "pDC", "Eryth", "Mk",
"DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated", "CD4 Naive T",
"CD4 Memory T"))
markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
"CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
"GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "IGJ", "PRSS57")
DotPlot(ifnb, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8, split.by = "stim") +
RotatedAxis()
library(ggplot2)
plot <- DotPlot(ifnb, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 6, split.by = "stim") +
RotatedAxis()
# ggsave(filename = '../output/images/pbmc_alignment.jpg', height = 7, width = 12, plot =
# plot, quality = 50)
Now that we’ve aligned the stimulated and control cells, we can start to do comparative analyses and look at the differences induced by stimulation. One way to look broadly at these changes is to plot the average expression of both the stimulated and control cells and look for genes that are visual outliers on a scatter plot. Here, we take the average expression of both the stimulated and control naive T cells and CD14 monocyte populations and generate the scatter plots, highlighting genes that exhibit dramatic responses to interferon stimulation.
library(ggplot2)
library(cowplot)
theme_set(theme_cowplot())
t.cells <- subset(ifnb, idents = "CD4 Naive T")
Idents(t.cells) <- "stim"
avg.t.cells <- as.data.frame(log1p(AverageExpression(t.cells, verbose = FALSE)$RNA))
avg.t.cells$gene <- rownames(avg.t.cells)
cd14.mono <- subset(ifnb, idents = "CD14 Mono")
Idents(cd14.mono) <- "stim"
avg.cd14.mono <- as.data.frame(log1p(AverageExpression(cd14.mono, verbose = FALSE)$RNA))
avg.cd14.mono$gene <- rownames(avg.cd14.mono)
genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8")
p1 <- ggplot(avg.t.cells, aes(CTRL, STIM)) + geom_point() + ggtitle("CD4 Naive T Cells")
p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE)
p2 <- ggplot(avg.cd14.mono, aes(CTRL, STIM)) + geom_point() + ggtitle("CD14 Monocytes")
p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE)
p1 + p2
As you can see, many of the same genes are upregulated in both of these cell types and likely represent a conserved interferon response pathway.
Because we are confident in having identified common cell types
across condition, we can ask what genes change in different conditions
for cells of the same type. First, we create a column in the meta.data
slot to hold both the cell type and stimulation information and switch
the current ident to that column. Then we use FindMarkers()
to find the genes that are different between stimulated and control B
cells. Notice that many of the top genes that show up here are the same
as the ones we plotted earlier as core interferon response genes.
Additionally, genes like CXCL10 which we saw were specific to monocyte
and B cell interferon response show up as highly significant in this
list as well.
ifnb$celltype.stim <- paste(Idents(ifnb), ifnb$stim, sep = "_")
ifnb$celltype <- Idents(ifnb)
Idents(ifnb) <- "celltype.stim"
b.interferon.response <- FindMarkers(ifnb, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE)
head(b.interferon.response, n = 15)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## ISG15 9.059333e-159 4.5698269 0.998 0.238 1.273108e-154
## IFIT3 4.791846e-154 4.4791700 0.965 0.051 6.733981e-150
## IFI6 7.806881e-152 4.2289881 0.963 0.076 1.097101e-147
## ISG20 1.061756e-149 2.9198403 1.000 0.672 1.492086e-145
## IFIT1 1.946349e-139 4.1101556 0.908 0.032 2.735204e-135
## MX1 1.243713e-123 3.2589732 0.908 0.113 1.747790e-119
## LY6E 9.532583e-120 3.1248078 0.896 0.147 1.339614e-115
## TNFSF10 2.678440e-112 3.8106081 0.787 0.020 3.764012e-108
## IFIT2 1.779710e-109 3.6693485 0.789 0.032 2.501027e-105
## B2M 3.573225e-99 0.6249356 1.000 1.000 5.021453e-95
## IRF7 3.158083e-95 2.6051975 0.843 0.191 4.438053e-91
## PLSCR1 6.649893e-94 2.7815365 0.792 0.118 9.345094e-90
## CXCL10 3.138646e-86 5.3307595 0.651 0.010 4.410739e-82
## UBE2L6 1.553836e-83 2.1249493 0.857 0.299 2.183606e-79
## PSMB9 5.270716e-78 1.6498113 0.937 0.559 7.406937e-74
Another useful way to visualize these changes in gene expression is
with the split.by option to the FeaturePlot()
or VlnPlot() function. This will display FeaturePlots of
the list of given genes, split by a grouping variable (stimulation
condition here). Genes such as CD3D and GNLY are canonical cell type
markers (for T cells and NK/CD8 T cells) that are virtually unaffected
by interferon stimulation and display similar gene expression patterns
in the control and stimulated group. IFI6 and ISG15, on the other hand,
are core interferon response genes and are upregulated accordingly in
all cell types. Finally, CD14 and CXCL10 are genes that show a cell type
specific interferon response. CD14 expression decreases after
stimulation in CD14 monocytes, which could lead to misclassification in
a supervised analysis framework, underscoring the value of integrated
analysis. CXCL10 shows a distinct upregulation in monocytes and B cells
after interferon stimulation but not in other cell types.
FeaturePlot(ifnb, features = c("CD3D", "GNLY", "IFI6"), split.by = "stim", max.cutoff = 3, cols = c("grey",
"red"))
plots <- VlnPlot(ifnb, features = c("LYZ", "ISG15", "CXCL10"), split.by = "stim", group.by = "celltype",
pt.size = 0, combine = FALSE)
wrap_plots(plots = plots, ncol = 1)
# saveRDS(ifnb, file = '../output/ifnb.rds')
In Hafemeister and Satija, 2019, we introduced an improved method for the normalization of scRNA-seq, based on regularized negative binomial regression. The method is named ‘sctransform’, and avoids some of the pitfalls of standard normalization workflows, including the addition of a pseudocount, and log-transformation. You can read more about sctransform in the manuscript or our SCTransform vignette.
Below, we demonstrate how to modify the Seurat integration workflow for datasets that have been normalized with the sctransform workflow. The commands are largely similar, with a few key differences:
SCTransform(),
instead of NormalizeData() prior to integrationPrepSCTIntegration() function prior to
identifying anchorsFindIntegrationAnchors(), and
IntegrateData(), set the normalization.method
parameter to the value SCT.ScaleData() functionLoadData("ifnb")
## Error in slot(object = object, name = s): no slot of name "images" for this object of class "Seurat"
ifnb <- UpdateSeuratObject(ifnb)
ifnb[["RNA"]] <- CreateAssay5Object(ifnb[["RNA"]]@counts)
## Error in CreateAssay5Object(ifnb[["RNA"]]@counts): no slot of name "counts" for this object of class "Assay5"
ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim)
## Error in slot(object = object, name = "layers")[[layer]][features, cells, : invalid or not-yet-implemented 'Matrix' subsetting
ifnb <- SCTransform(ifnb)
ifnb <- RunPCA(ifnb)
ifnb <- IntegrateLayers(object = ifnb, method = CCAIntegration, normalization.method = "SCT", verbose = F)
ifnb <- RunUMAP(ifnb, reduction = "integrated.dr", dims = 1:30)
p1 <- DimPlot(ifnb, reduction = "umap", group.by = "stim")
p2 <- DimPlot(ifnb, reduction = "umap", group.by = "seurat_annotations", label = TRUE, repel = TRUE)
p1 + p2
Now that the datasets have been integrated, you can follow the previous steps in this vignette identify cell types and cell type-specific responses.
# write.csv(x = t(as.data.frame(all_times)), file =
# '../output/timings/seurat5_integration_introduction.csv')
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.1.1 ggplot2_3.4.0 patchwork_1.1.2
## [4] pbmc3k.SeuratData_3.1.4 panc8.SeuratData_3.0.2 ifnb.SeuratData_3.1.0
## [7] SeuratData_0.2.2 Seurat_4.9.9.9020 SeuratObject_4.9.9.9053
## [10] sp_1.5-1
##
## loaded via a namespace (and not attached):
## [1] spam_2.9-1 sn_2.1.0 plyr_1.8.8
## [4] igraph_1.3.5 lazyeval_0.2.2 splines_4.2.2
## [7] RcppHNSW_0.4.1 listenv_0.9.0 scattermore_0.8
## [10] qqconf_1.3.0 TH.data_1.1-1 digest_0.6.31
## [13] htmltools_0.5.4 fansi_1.0.3 magrittr_2.0.3
## [16] tensor_1.5 cluster_2.1.4 ROCR_1.0-11
## [19] globals_0.16.2 matrixStats_0.63.0 sandwich_3.0-2
## [22] spatstat.sparse_3.0-0 colorspace_2.0-3 rappdirs_0.3.3
## [25] ggrepel_0.9.2 rbibutils_2.2.9 xfun_0.36
## [28] dplyr_1.0.10 crayon_1.5.2 jsonlite_1.8.4
## [31] progressr_0.13.0 spatstat.data_3.0-0 survival_3.4-0
## [34] zoo_1.8-11 glue_1.6.2 polyclip_1.10-4
## [37] gtable_0.3.1 leiden_0.4.3 DelayedArray_0.22.0
## [40] future.apply_1.10.0 BiocGenerics_0.44.0 abind_1.4-5
## [43] scales_1.2.1 mvtnorm_1.1-3 DBI_1.1.2
## [46] spatstat.random_3.0-1 miniUI_0.1.1.1 Rcpp_1.0.9
## [49] plotrix_3.8-2 metap_1.8 viridisLite_0.4.1
## [52] xtable_1.8-4 reticulate_1.27 dotCall64_1.0-2
## [55] stats4_4.2.2 htmlwidgets_1.6.1 httr_1.4.4
## [58] RColorBrewer_1.1-3 TFisher_0.2.0 ellipsis_0.3.2
## [61] ica_1.0-3 pkgconfig_2.0.3 farver_2.1.1
## [64] sass_0.4.4 uwot_0.1.14 deldir_1.0-6
## [67] utf8_1.2.2 tidyselect_1.2.0 labeling_0.4.2
## [70] rlang_1.0.6 reshape2_1.4.4 later_1.3.0
## [73] munsell_0.5.0 tools_4.2.2 cachem_1.0.6
## [76] cli_3.6.0 generics_0.1.3 mathjaxr_1.6-0
## [79] ggridges_0.5.4 evaluate_0.19 stringr_1.5.0
## [82] fastmap_1.1.0 yaml_2.3.6 goftest_1.2-3
## [85] knitr_1.41 fitdistrplus_1.1-8 purrr_1.0.1
## [88] RANN_2.6.1 pbapply_1.6-0 future_1.30.0
## [91] nlme_3.1-161 mime_0.12 formatR_1.12
## [94] compiler_4.2.2 rstudioapi_0.14 plotly_4.10.1
## [97] png_0.1-8 spatstat.utils_3.0-1 tibble_3.1.8
## [100] bslib_0.4.2 stringi_1.7.12 highr_0.10
## [103] RSpectra_0.16-1 lattice_0.20-45 Matrix_1.5-1
## [106] multtest_2.52.0 vctrs_0.5.1 mutoss_0.1-12
## [109] pillar_1.8.1 lifecycle_1.0.3 spatstat.geom_3.0-3
## [112] Rdpack_2.4 lmtest_0.9-40 jquerylib_0.1.4
## [115] RcppAnnoy_0.0.20 data.table_1.14.6 irlba_2.3.5.1
## [118] httpuv_1.6.7 R6_2.5.1 promises_1.2.0.1
## [121] KernSmooth_2.23-20 gridExtra_2.3 IRanges_2.32.0
## [124] parallelly_1.34.0 codetools_0.2-18 fastDummies_1.6.3
## [127] MASS_7.3-58 assertthat_0.2.1 withr_2.5.0
## [130] presto_1.0.0 mnormt_2.1.1 sctransform_0.3.5
## [133] S4Vectors_0.36.0 multcomp_1.4-20 parallel_4.2.2
## [136] grid_4.2.2 tidyr_1.2.1 rmarkdown_2.19
## [139] MatrixGenerics_1.8.1 Rtsne_0.16 spatstat.explore_3.0-5
## [142] Biobase_2.56.0 numDeriv_2016.8-1.1 shiny_1.7.4